In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
In [3]:
# Load data
diabetes = pd.read_csv("/Users/surekhadhulipalla/Desktop/diabetes.csv")
# Select features and target variable
X = diabetes[['Pregnancies', 'Age', 'Glucose']]
y = diabetes['Outcome']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Define and train the Random Forest model
diabetes_clf = RandomForestClassifier(n_estimators=50, random_state=42)
diabetes_clf.fit(X_train, y_train)
# Make predictions
y_pred = diabetes_clf.predict(X_test)
# Evaluate the model
CM = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", CM)
AS = accuracy_score(y_test, y_pred)
print("Accuracy Score:", AS)
CR = classification_report(y_test, y_pred)
print("Classification Report:\n", CR)
Confusion Matrix:
[[122 29]
[ 36 44]]
Accuracy Score: 0.7186147186147186
Classification Report:
precision recall f1-score support
0 0.77 0.81 0.79 151
1 0.60 0.55 0.58 80
accuracy 0.72 231
macro avg 0.69 0.68 0.68 231
weighted avg 0.71 0.72 0.72 231
In [5]:
from sklearn.tree import export_graphviz
import graphviz
In [7]:
rf=RandomForestClassifier(n_estimators=1)
rf.fit(X_train,y_train)
Out[7]:
RandomForestClassifier(n_estimators=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [9]:
len(rf.estimators_)
Out[9]:
1
In [11]:
from sklearn import tree
X=diabetes[['Pregnancies','Age','Glucose']]
y = diabetes['Outcome']
plt.figure(figsize=(95,50))
_=tree.plot_tree(rf.estimators_[0], filled=True,fontsize=10)
In [57]:
# Caluculate running time
import time
start_time = time.time()
diabetes_clf = RandomForestClassifier(n_estimators=50, random_state=42)
diabetes_clf.fit(X_train, y_train)
end_time = time.time()
non_parallel_time= end_time - start_time
non_parallel_time
Out[57]:
0.07692289352416992
In [19]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
import dask.dataframe as dd
from dask_ml.metrics import accuracy_score
In [21]:
# Load dataset
diabetes = pd.read_csv("/Users/surekhadhulipalla/Desktop/diabetes.csv")
diabetes.head
Out[21]:
<bound method NDFrame.head of Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
.. ... ... ... ... ... ...
763 10 101 76 48 180 32.9
764 2 122 70 27 0 36.8
765 5 121 72 23 112 26.2
766 1 126 60 0 0 30.1
767 1 93 70 31 0 30.4
DiabetesPedigreeFunction Age Outcome
0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1
.. ... ... ...
763 0.171 63 0
764 0.340 27 0
765 0.245 30 0
766 0.349 47 1
767 0.315 23 0
[768 rows x 9 columns]>
In [23]:
# Load your dataset into a Dask dataframe
diabetes_df = dd.read_csv("/Users/surekhadhulipalla/Desktop/diabetes.csv")
diabetes_df.head()
Out[23]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
In [25]:
X=diabetes_df[['Pregnancies','Age','Glucose']]
y = diabetes_df['Outcome']
y
Out[25]:
Dask Series Structure:
npartitions=1
int64
...
Dask Name: getitem, 3 expressions
Expr=ArrowStringConversion(frame=FromMapProjectable(c1c5de6))['Outcome']
In [27]:
# import packages
from dask.distributed import Client
from sklearn.metrics import accuracy_score
# Initialize Dask Client
client = Client(n_workers=4)
# Define features (X) and target (y)
X = diabetes[['Pregnancies','Age','Glucose']]
y = diabetes['Outcome']
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)
# Model Training
dask_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
dask_model.fit(X_train, y_train)
# Predictions
y_pred = dask_model.predict(X_test)
# Accuracy Calculation
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
# Close Dask Client
client.close()
/opt/anaconda3/lib/python3.12/site-packages/distributed/node.py:187: UserWarning: Port 8787 is already in use. Perhaps you already have a cluster running? Hosting the HTTP server on port 51009 instead warnings.warn(
Model Accuracy: 0.7186147186147186
In [29]:
rf=RandomForestClassifier(n_estimators=1)
rf.fit(X_train,y_train)
Out[29]:
RandomForestClassifier(n_estimators=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [31]:
RandomForestClassifier
RandomForestClassifier(n_estimators=1)
Out[31]:
RandomForestClassifier(n_estimators=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [33]:
len(rf.estimators_)
Out[33]:
1
In [55]:
from sklearn import tree
X=diabetes[['Pregnancies','Age','Glucose']]
y = diabetes['Outcome']
plt.figure(figsize=(95,50))
_=tree.plot_tree(rf.estimators_[0], filled=True,fontsize=12)
In [59]:
start_time = time.time()
dask_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
dask_model.fit(X_train, y_train)
end_time = time.time()
parallel_time= end_time - start_time
print(parallel_time)
0.0440981388092041
In [61]:
# Plot results of running time before and after parallelization
plt.bar(['Non-Parallel', 'Parallel (Dask)'], [non_parallel_time, parallel_time],color='blue')
plt.ylabel("Time (seconds)")
plt.title("Execution Time Comparison")
plt.show()
In [ ]:
In [ ]:
In [ ]: